#!pip install xgboost
#!pip install shap
#!pip install lime
import os
import warnings
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.graph_objects as go
import plotly.offline as pyo
import plotly.io as pio
pio.renderers.default = "notebook_connected"
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
import statsmodels.formula.api as smf
from scipy.interpolate import interp1d
from scipy.stats import ttest_ind, pearsonr
from sklearn.preprocessing import FunctionTransformer
from sklearn.model_selection import train_test_split, GroupShuffleSplit, cross_val_score, GridSearchCV, StratifiedGroupKFold
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.preprocessing import StandardScaler, LabelEncoder, RobustScaler
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.metrics import (r2_score, mean_squared_error, accuracy_score, precision_score,
f1_score, classification_report, ConfusionMatrixDisplay, confusion_matrix)
import shap
from shap import summary_plot
import lime
import lime.lime_tabular
#%matplotlib inline
warnings.filterwarnings("ignore")
This section ensures the dataset is correctly loaded and clean before moving on to the analysis.
We perform the following checks: file existence, missing values, duplicate rows, value ranges, repeated measurements per session, and negative values.
Negative values are flagged because features such as 'Jitter', 'Shimmer', and the clinical scores should never be negative; such values may result from sensor noise, data-entry issues, or corruption.
If the number of affected rows is very small (less than 1% of the total), we remove them to improve data integrity without affecting the dataset's balance.
def load_and_validate_data(file_path):
    """Load the telemonitoring CSV and run basic integrity checks.

    Checks performed: file existence, head preview, missing values,
    descriptive statistics, duplicate rows, value-range sanity checks,
    repeated measurements per (subject, test_time), and negative values
    in numeric columns.  Rows containing negative values are dropped
    only when they make up less than 1% of all rows.

    Parameters
    ----------
    file_path : str
        Path to the CSV file.

    Returns
    -------
    pandas.DataFrame
        The validated (and possibly reduced) dataset.

    Raises
    ------
    FileNotFoundError
        If `file_path` does not exist.
    ValueError
        If 'age' or 'sex' contain out-of-range values.
    """
    # `display` is an IPython builtin; fall back to print() so the function
    # also runs in plain Python scripts.
    try:
        show = display
    except NameError:
        show = print

    # 1. Check if file exists
    if not os.path.exists(file_path):
        raise FileNotFoundError(f"{file_path} not found.")

    # 2. Load data
    df = pd.read_csv(file_path)
    print("=== File loaded successfully ===")

    # 3. Display head
    print("\n=== First 5 rows ===")
    show(df.head())

    # 4. Check for missing values
    print("\n=== Missing values ===")
    show(df.isnull().sum())

    # 5. Descriptive statistics
    print("\n=== Descriptive statistics ===")
    show(df.describe())

    # 6. Check for duplicate rows
    print("\n=== Checking for duplicate rows ===")
    duplicate_rows = df[df.duplicated()]
    print(f"Total duplicate rows: {len(duplicate_rows)}")
    if not duplicate_rows.empty:
        show(duplicate_rows.head())

    # 7. Value-range checks.  Raise ValueError rather than assert so the
    # checks survive `python -O` (asserts are stripped under optimization).
    if not df['age'].between(30, 100).all():
        raise ValueError("Unexpected values in 'age' column")
    if not df['sex'].isin([0, 1]).all():
        raise ValueError("Invalid values in 'sex' column")

    # 8. Checking for multiple signal readings from one telemonitoring session
    print("\n=== Multiple measures check ===\n")
    dup_check = df.groupby(['subject#', 'test_time']).size()
    true_duplicates = dup_check[dup_check > 1]
    print(f"Multiple measures at the same time: {len(true_duplicates)}")

    # 9. Check and remove negative values (if negligible).
    numeric_cols = df.select_dtypes(include=np.number).columns
    negative_mask = (df[numeric_cols] < 0)
    negative_counts = negative_mask.sum()
    print("\n=== Negative value counts ===")
    show(negative_counts[negative_counts > 0])
    # Count affected ROWS: the previous version compared/reported the number
    # of negative *cells*, which overstates rows dropped whenever a row has
    # more than one negative value.
    rows_with_negatives = negative_mask.any(axis=1)
    n_bad_rows = int(rows_with_negatives.sum())
    if n_bad_rows / df.shape[0] < 0.01:
        df = df[~rows_with_negatives]
        print(f"\nDropped {n_bad_rows} rows with negative values (negligible share).")

    print("\n=== Validation complete ===\n\n")
    return df
# === Load and validate data ===
# NOTE(review): filename matches the UCI Parkinson's Telemonitoring dataset —
# confirm the file is present in the working directory before running.
DATA_FILE = "parkinsons_updrs.data"  # Change to your actual filename
df = load_and_validate_data(DATA_FILE)
=== File loaded successfully === === First 5 rows ===
| subject# | age | sex | test_time | motor_UPDRS | total_UPDRS | Jitter(%) | Jitter(Abs) | Jitter:RAP | Jitter:PPQ5 | ... | Shimmer(dB) | Shimmer:APQ3 | Shimmer:APQ5 | Shimmer:APQ11 | Shimmer:DDA | NHR | HNR | RPDE | DFA | PPE | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1 | 72 | 0 | 5.6431 | 28.199 | 34.398 | 0.00662 | 0.000034 | 0.00401 | 0.00317 | ... | 0.230 | 0.01438 | 0.01309 | 0.01662 | 0.04314 | 0.014290 | 21.640 | 0.41888 | 0.54842 | 0.16006 |
| 1 | 1 | 72 | 0 | 12.6660 | 28.447 | 34.894 | 0.00300 | 0.000017 | 0.00132 | 0.00150 | ... | 0.179 | 0.00994 | 0.01072 | 0.01689 | 0.02982 | 0.011112 | 27.183 | 0.43493 | 0.56477 | 0.10810 |
| 2 | 1 | 72 | 0 | 19.6810 | 28.695 | 35.389 | 0.00481 | 0.000025 | 0.00205 | 0.00208 | ... | 0.181 | 0.00734 | 0.00844 | 0.01458 | 0.02202 | 0.020220 | 23.047 | 0.46222 | 0.54405 | 0.21014 |
| 3 | 1 | 72 | 0 | 25.6470 | 28.905 | 35.810 | 0.00528 | 0.000027 | 0.00191 | 0.00264 | ... | 0.327 | 0.01106 | 0.01265 | 0.01963 | 0.03317 | 0.027837 | 24.445 | 0.48730 | 0.57794 | 0.33277 |
| 4 | 1 | 72 | 0 | 33.6420 | 29.187 | 36.375 | 0.00335 | 0.000020 | 0.00093 | 0.00130 | ... | 0.176 | 0.00679 | 0.00929 | 0.01819 | 0.02036 | 0.011625 | 26.126 | 0.47188 | 0.56122 | 0.19361 |
5 rows × 22 columns
=== Missing values ===
subject# 0 age 0 sex 0 test_time 0 motor_UPDRS 0 total_UPDRS 0 Jitter(%) 0 Jitter(Abs) 0 Jitter:RAP 0 Jitter:PPQ5 0 Jitter:DDP 0 Shimmer 0 Shimmer(dB) 0 Shimmer:APQ3 0 Shimmer:APQ5 0 Shimmer:APQ11 0 Shimmer:DDA 0 NHR 0 HNR 0 RPDE 0 DFA 0 PPE 0 dtype: int64
=== Descriptive statistics ===
| subject# | age | sex | test_time | motor_UPDRS | total_UPDRS | Jitter(%) | Jitter(Abs) | Jitter:RAP | Jitter:PPQ5 | ... | Shimmer(dB) | Shimmer:APQ3 | Shimmer:APQ5 | Shimmer:APQ11 | Shimmer:DDA | NHR | HNR | RPDE | DFA | PPE | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| count | 5875.000000 | 5875.000000 | 5875.000000 | 5875.000000 | 5875.000000 | 5875.000000 | 5875.000000 | 5875.000000 | 5875.000000 | 5875.000000 | ... | 5875.000000 | 5875.000000 | 5875.000000 | 5875.000000 | 5875.000000 | 5875.000000 | 5875.000000 | 5875.000000 | 5875.000000 | 5875.000000 |
| mean | 21.494128 | 64.804936 | 0.317787 | 92.863722 | 21.296229 | 29.018942 | 0.006154 | 0.000044 | 0.002987 | 0.003277 | ... | 0.310960 | 0.017156 | 0.020144 | 0.027481 | 0.051467 | 0.032120 | 21.679495 | 0.541473 | 0.653240 | 0.219589 |
| std | 12.372279 | 8.821524 | 0.465656 | 53.445602 | 8.129282 | 10.700283 | 0.005624 | 0.000036 | 0.003124 | 0.003732 | ... | 0.230254 | 0.013237 | 0.016664 | 0.019986 | 0.039711 | 0.059692 | 4.291096 | 0.100986 | 0.070902 | 0.091498 |
| min | 1.000000 | 36.000000 | 0.000000 | -4.262500 | 5.037700 | 7.000000 | 0.000830 | 0.000002 | 0.000330 | 0.000430 | ... | 0.026000 | 0.001610 | 0.001940 | 0.002490 | 0.004840 | 0.000286 | 1.659000 | 0.151020 | 0.514040 | 0.021983 |
| 25% | 10.000000 | 58.000000 | 0.000000 | 46.847500 | 15.000000 | 21.371000 | 0.003580 | 0.000022 | 0.001580 | 0.001820 | ... | 0.175000 | 0.009280 | 0.010790 | 0.015665 | 0.027830 | 0.010955 | 19.406000 | 0.469785 | 0.596180 | 0.156340 |
| 50% | 22.000000 | 65.000000 | 0.000000 | 91.523000 | 20.871000 | 27.576000 | 0.004900 | 0.000035 | 0.002250 | 0.002490 | ... | 0.253000 | 0.013700 | 0.015940 | 0.022710 | 0.041110 | 0.018448 | 21.920000 | 0.542250 | 0.643600 | 0.205500 |
| 75% | 33.000000 | 72.000000 | 1.000000 | 138.445000 | 27.596500 | 36.399000 | 0.006800 | 0.000053 | 0.003290 | 0.003460 | ... | 0.365000 | 0.020575 | 0.023755 | 0.032715 | 0.061735 | 0.031463 | 24.444000 | 0.614045 | 0.711335 | 0.264490 |
| max | 42.000000 | 85.000000 | 1.000000 | 215.490000 | 39.511000 | 54.992000 | 0.099990 | 0.000446 | 0.057540 | 0.069560 | ... | 2.107000 | 0.162670 | 0.167020 | 0.275460 | 0.488020 | 0.748260 | 37.875000 | 0.966080 | 0.865600 | 0.731730 |
8 rows × 22 columns
=== Checking for duplicate rows === Total duplicate rows: 0 === Multiple measures check === Multiple measures at the same time: 1530 === Negative value counts ===
test_time 12 dtype: int64
Dropped 12 rows with negative values (negligible share). === Validation complete ===
We split the dataset into training and test sets using 'GroupShuffleSplit' to ensure no subject appears in both sets.
# Subject-level 80/20 split: GroupShuffleSplit guarantees every subject's
# recordings land entirely in either the train or the test set.
X = df.drop(columns=['motor_UPDRS', 'total_UPDRS'])
y = df["motor_UPDRS"]  # or "total_UPDRS"
groups = df["subject#"]

splitter = GroupShuffleSplit(n_splits=1, test_size=0.2, random_state=42)
train_idx, test_idx = next(splitter.split(X, y, groups=groups))

X_train = X.iloc[train_idx]
X_test = X.iloc[test_idx]
y_train = y.iloc[train_idx]
y_test = y.iloc[test_idx]
Histograms are used to observe the distribution and spread of numerical features in the dataset.
# Per-feature histograms for the training set (subject# excluded — it is an
# identifier, not a feature).
print("\n Plotting feature distributions...")
X_train.drop(columns=['subject#']).hist(figsize=(15, 12), bins=30, color='skyblue', edgecolor='black')
plt.suptitle("Feature Distributions", fontsize=16)
plt.tight_layout()
plt.show()
Plotting feature distributions...
Correlation heatmap is used to identify relationships between features and detect potential multicollinearity.
# Combine features and target for correlation.
# NOTE: the assignment aligns on the row index, so X_train rows pick up
# their matching motor_UPDRS values from df.
train_data = X_train.copy()
train_data["motor_UPDRS"] = df["motor_UPDRS"]
# Compute correlation matrix (subject# is an identifier, not a feature)
corr_matrix = train_data.drop(columns=['subject#']).corr()
# Plot correlation heatmap
plt.figure(figsize=(14, 10))
ax = sns.heatmap(
    corr_matrix,
    annot=True,
    fmt=".2f",
    cmap="coolwarm",
    center=0,  # white at zero correlation
    square=True,
    cbar_kws={
        'label': 'Correlation Strength',
        'shrink': 0.9,
        'ticks': [-1, -0.5, 0, 0.5, 1]
    }
)
# Label colorbar meaning — exactly one label per tick defined above;
# keep the two lists the same length if ticks change.
colorbar = ax.collections[0].colorbar
colorbar.ax.set_yticklabels([
    '-1: Strong Negative',
    '-0.5: Moderate Negative',
    '0: No Correlation',
    '+0.5: Moderate Positive',
    '+1: Strong Positive'
])
plt.title("Correlation Heatmap", fontsize=16)
plt.tight_layout()
plt.show()
Violin plot is used to visualize the distribution and density of a feature across severity classes.
It helps reveal differences in value spread, central tendency, and potential class-separating patterns.
# Create Age Groups (pd.cut bins are right-inclusive: (0,50], (50,60], ...)
train_data["age_group"] = pd.cut(train_data["age"], bins=[0, 50, 60, 70, 80, 100], labels=["<50", "50-60", "60-70", "70-80", "80+"])
# Motor UPDRS by Sex
# NOTE(review): tick labels assume the coding 0 = male, 1 = female — confirm
# against the dataset documentation.
plt.figure(figsize=(8, 5))
sns.violinplot(data=train_data, x="sex", y="motor_UPDRS")
plt.title("Motor UPDRS by Sex")
plt.xticks([0, 1], ['Male', 'Female'])
plt.tight_layout()
plt.show()
# Motor UPDRS by Age Group
plt.figure(figsize=(10, 5))
sns.violinplot(data=train_data, x="age_group", y="motor_UPDRS", palette="Set2")
plt.title("Motor UPDRS by Age Group")
plt.tight_layout()
plt.show()
Avg UPDRS over Time per Age Group (with labeled colorbar)
While overall distributions give a general view, class-wise plots (e.g., violin/histogram by severity) help identify features that distinguish between classes for classification.
# Bin test_time into 10 equal-width bins
train_data["time_bin"] = pd.cut(train_data["test_time"], bins=10)
# Group and compute mean motor_UPDRS per (age group, time bin) cell
heatmap_data = train_data.groupby(["age_group", "time_bin"])["motor_UPDRS"].mean().reset_index()
# Pivot for heatmap: rows = age groups, columns = time bins
heatmap_pivot = heatmap_data.pivot(index="age_group", columns="time_bin", values="motor_UPDRS")
# Plot heatmap
plt.figure(figsize=(10, 6))
ax = sns.heatmap(heatmap_pivot, cmap="YlOrRd", annot=True, fmt=".1f", cbar_kws={
    'label': 'Average Motor UPDRS',
    'shrink': 0.9
})
# Customize colorbar tick labels (round the auto-generated ticks)
colorbar = ax.collections[0].colorbar
colorbar.ax.set_yticklabels([f"{v:.0f}" for v in colorbar.get_ticks()])  # Rounded values
plt.title("Mean Motor UPDRS Over Time by Age Group ")
plt.xlabel("Test Time Bin")
plt.ylabel("Age Group")
plt.tight_layout()
plt.show()
# 5. RAPID PROGRESSION DETECTION – 30-days WINDOW-BASED
train_subjects = train_data["subject#"].unique()
# Define progression threshold: 4.6 UPDRS points/year pro-rated to a
# 30-day window (~0.38 points per 30 days).
fast_motor_progression_per_year = 4.6
window_days = 30
motor_threshold = fast_motor_progression_per_year / 365 * window_days
top_n = 10
def is_recently_decreasing(group, col, tail_days=30):
    """Return True if `col` decreased over the subject's last `tail_days`.

    Compares the earliest and latest value inside the trailing window.
    The window is sorted by 'test_time' here, so callers no longer need
    to pre-sort the DataFrame (the original relied on caller ordering).

    Parameters
    ----------
    group : pandas.DataFrame
        One subject's measurements; must contain 'test_time' and `col`.
    col : str
        Score column to inspect (e.g. 'motor_UPDRS').
    tail_days : float, optional
        Width of the trailing window in days (default 30).
    """
    recent = group[group["test_time"] >= (group["test_time"].max() - tail_days)]
    recent = recent.sort_values("test_time")
    return recent[col].iloc[-1] < recent[col].iloc[0]
# Keep only subjects whose motor score is NOT trending down over their
# last 30 days — a recent decrease disqualifies them as rapid progressors.
filtered_subjects = [
    subject
    for subject in train_subjects
    if not is_recently_decreasing(
        train_data[train_data["subject#"] == subject].sort_values("test_time"),
        "motor_UPDRS",
    )
]
def has_rapid_progression(group, col, threshold, window=30, tolerance=2):
    """Return True if `col` ever rises by more than `threshold` across a
    roughly `window`-day gap between any two measurements.

    All measurement pairs whose time gap lies within
    [window - tolerance, window + tolerance] days are compared.

    Parameters
    ----------
    group : pandas.DataFrame
        One subject's measurements; must contain 'test_time' and `col`.
    col : str
        Score column to check (e.g. 'motor_UPDRS').
    threshold : float
        Minimum increase over the window that counts as "rapid".
    window, tolerance : float, optional
        Target gap and the accepted slack around it, in days.  The window
        used to be read from the module-level `window_days` global; it is
        now a parameter with the same default (30).
    """
    times = group["test_time"].values
    values = group[col].values
    for i in range(len(times)):
        for j in range(i + 1, len(times)):
            delta_days = times[j] - times[i]
            # Inclusive lower bound, for consistency with the 30-day delta
            # histogram elsewhere in this analysis (28 <= dt <= 32); the old
            # strict `window - 2 < dt` excluded an exact 28-day gap.
            if window - tolerance <= delta_days <= window + tolerance:
                delta_val = values[j] - values[i]
                if delta_val > threshold:
                    return True
    return False
# Subjects that survive the decreasing-filter AND show a >threshold jump
# across some ~30-day gap are flagged as rapid progressors.
rapid_motor_progressors = [
    subject
    for subject in filtered_subjects
    if has_rapid_progression(
        train_data[train_data["subject#"] == subject].sort_values("test_time"),
        "motor_UPDRS",
        motor_threshold,
    )
]
def compute_slope(x, y):
    """Return the ordinary-least-squares slope of y on x (points per day).

    Uses numpy.polyfit instead of a full sklearn LinearRegression fit —
    the degree-1 least-squares slope is identical, with far less overhead.

    Parameters
    ----------
    x, y : array-like of equal length (times and scores).

    Returns
    -------
    float
        The OLS slope, or 0 when fewer than two points are available.
    """
    x = np.asarray(x, dtype=float)
    y = np.asarray(y, dtype=float)
    if len(x) < 2:
        return 0
    return np.polyfit(x, y, 1)[0]
# Rank flagged subjects by the overall OLS slope of motor_UPDRS vs time.
motor_slopes = {
    subject: compute_slope(train_data[train_data["subject#"] == subject]["test_time"],
                           train_data[train_data["subject#"] == subject]["motor_UPDRS"])
    for subject in rapid_motor_progressors
}
# Steepest slopes first; keep the top_n subject IDs
top_motor_subjects = sorted(motor_slopes, key=motor_slopes.get, reverse=True)[:top_n]
# Sanity check: observation span per subject
time_range = train_data.groupby("subject#")["test_time"].apply(lambda x: x.max() - x.min())
print(f"\nTime range (min–max): {time_range.min():.2f} – {time_range.max():.2f} days")
print(f"30-day Motor UPDRS Threshold: {motor_threshold:.2f}")
print(f"Top {top_n} Rapid Motor Progressors: {top_motor_subjects}\n")
Time range (min–max): 153.01 – 203.99 days 30-day Motor UPDRS Threshold: 0.38 Top 10 Rapid Motor Progressors: [37, 13, 21, 25, 39, 1, 4, 2, 33, 10]
We use Plotly to visualize the motor_UPDRS progression over time for the top N fastest progressors.
Each line represents a subject, showing how quickly their scores are rising across test days.
This interactive plot helps validate the rapid progression detection and provides clear insight into patient deterioration trends.
# Interactive line chart: one trace per top progressor, motor_UPDRS vs time.
fig = go.Figure()
for subject in top_motor_subjects:
    patient_data = train_data[train_data["subject#"] == subject].sort_values("test_time")
    fig.add_trace(go.Scatter(
        x=patient_data["test_time"],
        y=patient_data["motor_UPDRS"],
        mode='lines+markers',
        name=f"Subject {subject}",
        hoverinfo="text",
        # Custom hover text: subject id, day, and score for each point
        text=[f"Subject: {subject}<br>Time: {t:.1f}<br>Motor UPDRS: {u:.2f}" for t, u in zip(patient_data["test_time"], patient_data["motor_UPDRS"])]
    ))
fig.update_layout(
    title=f"Top {top_n} Motor UPDRS Progressors (30-Day Spike, Training Set)",
    xaxis_title="Test Time (Days)",
    yaxis_title="Motor UPDRS",
    hovermode="closest",
    template="plotly_white",
    width=1000, height=600
)
fig.show()
# Save interactive plot as HTML file (fully self-contained)
#pyo.plot(fig, filename='top_10_updrs_progressors.html', auto_open=True)
Computing a rolling average smooths out short-term fluctuations in the symptom scores and highlights the underlying progression trend more clearly.
We randomly select 5 subjects from the training set and plot their rolling-average motor_UPDRS over time.
# Step 1: per-subject rolling mean (window of 3 visits) of motor_UPDRS.
smoothed_parts = []
for subject, group in train_data.groupby("subject#"):
    ordered = group.sort_values("test_time").copy()
    ordered["motor_UPDRS_rollmean"] = (
        ordered["motor_UPDRS"].rolling(window=3, min_periods=1).mean()
    )
    smoothed_parts.append(ordered)
df_rolling = pd.concat(smoothed_parts)

# Step 2: overlay the smoothed curves of 5 randomly chosen training subjects.
sample_subjects = train_data["subject#"].drop_duplicates().sample(5, random_state=42)

plt.figure(figsize=(12, 6))
for subj in sample_subjects:
    subject_curve = df_rolling[df_rolling["subject#"] == subj]
    plt.plot(
        subject_curve["test_time"],
        subject_curve["motor_UPDRS_rollmean"],
        label=f"Subject {subj}",
    )
plt.title("Rolling Average of Motor UPDRS (Window=3)")
plt.xlabel("Test Time (Days)")
plt.ylabel("Motor UPDRS (Rolling Mean)")
plt.legend()
plt.tight_layout()
plt.show()
# Collect the change in motor_UPDRS across each starting point's FIRST
# ~30-day gap (28–32 days), per subject.
delta_motor = []
# Loop through each subject in training data
for subject, group in train_data.groupby("subject#"):
    group = group.sort_values("test_time").reset_index(drop=True)
    for i in range(len(group)):
        for j in range(i + 1, len(group)):
            delta_t = group.loc[j, "test_time"] - group.loc[i, "test_time"]
            if 28 <= delta_t <= 32:
                delta_motor.append(group.loc[j, "motor_UPDRS"] - group.loc[i, "motor_UPDRS"])
                break  # Only take the first 30-day window delta for each i
# Plot histogram of the 30-day deltas.  (The redundant mid-file
# `import matplotlib.pyplot as plt` was removed — plt is already imported
# at the top of the file.)
plt.figure(figsize=(12, 6))
plt.hist(delta_motor, bins=30, color='teal', edgecolor='black')
plt.title("Δ Motor UPDRS Over 30-Day Windows (Training Set)")
plt.xlabel("Change in Motor UPDRS")
plt.ylabel("Frequency")
plt.tight_layout()
plt.show()
The continuous motor_UPDRS scores were converted into binary severity labels:
These labels were then encoded into integers using LabelEncoder to prepare for classification modeling. Class distribution in y_train_encoded was checked to verify label balance after encoding.
# 1. Drop duplicates to count per subject
train_unique = df.iloc[train_idx].drop_duplicates("subject#")
# 2. Count subjects per severity level.  The cut-point matches the label
# encoding below (motor_UPDRS <= 20 -> "Mild", otherwise "Moderate+").
# The previous integer bins (0-20 and 21-40) silently ignored scores in
# the open interval (20, 21), undercounting the plotted totals.
severity_df = pd.DataFrame([
    {
        "Severity Level": "Mild",
        "Motor UPDRS Count": int((train_unique["motor_UPDRS"] <= 20).sum()),
    },
    {
        "Severity Level": "Moderate+",
        "Motor UPDRS Count": int((train_unique["motor_UPDRS"] > 20).sum()),
    },
])
# 3. Plot bar chart with a count label on top of each bar
plt.figure(figsize=(4, 5))
ax = sns.barplot(data=severity_df, x="Severity Level", y="Motor UPDRS Count", palette="pastel")
for p in ax.patches:
    ax.text(p.get_x() + p.get_width() / 2, p.get_height(), f"{int(p.get_height())}",
            ha="center", va="bottom", fontweight="bold")
plt.title("Motor UPDRS Severity Level Distribution (Train Set)")
plt.ylabel("Number of Subjects")
plt.xlabel("Severity Level")
plt.tight_layout()
plt.show()
# 4. Convert to binary labels + encode (LabelEncoder sorts alphabetically:
# "Mild" -> 0, "Moderate+" -> 1)
le = LabelEncoder()
y_train_encoded = le.fit_transform(y_train.apply(lambda x: "Mild" if x <= 20 else "Moderate+"))
y_test_encoded = le.transform(y_test.apply(lambda x: "Mild" if x <= 20 else "Moderate+"))
# 5. Show class counts
print("Class Distribution in y_train (Binary):")
print(pd.Series(y_train_encoded).value_counts())
Class Distribution in y_train (Binary): 1 2554 0 2074 dtype: int64
# Candidate voice/demographic features considered for modeling.
# NOTE(review): this list is not referenced by the pipelines below —
# feature selection happens via `final_features`; confirm it is still needed.
base_features = [
    "age", "sex", "Jitter(%)", "Jitter(Abs)", "Jitter:RAP", "Jitter:PPQ5", "Jitter:DDP",
    "Shimmer", "Shimmer(dB)", "Shimmer:APQ3", "Shimmer:APQ5", "Shimmer:APQ11", "Shimmer:DDA",
    "NHR", "HNR", "RPDE", "DFA", "PPE"]
def engineer_features(df):
    """Return a copy of `df` with four derived columns appended.

    - log_PPE: log1p-compressed pitch period entropy
    - Inv_JitterAbs: reciprocal of absolute jitter (epsilon-guarded)
    - Shimmer_Slope: spread between long- and short-window shimmer
    - Age_Sex_Interaction: age x sex interaction term

    The input DataFrame is not mutated.
    """
    return df.assign(
        log_PPE=np.log1p(df["PPE"]),
        Inv_JitterAbs=1 / (df["Jitter(Abs)"] + 1e-6),
        Shimmer_Slope=df["Shimmer:APQ11"] - (df["Shimmer:APQ3"] + 1e-6),
        Age_Sex_Interaction=df["age"] * df["sex"],
    )
# Transformer to use in pipeline (validate=False preserves the DataFrame
# type so column access by name still works downstream)
engineer_transformer = FunctionTransformer(engineer_features, validate=False)
# Final model inputs, chosen from the correlation/distribution EDA to
# avoid redundancy among the highly collinear jitter/shimmer variants.
final_features = [
    "age", "sex", "Shimmer_Slope", "Inv_JitterAbs",
    "log_PPE", "RPDE", "DFA", "Age_Sex_Interaction"
]
# NOTE: the lambda closes over the module-level `final_features` list.
select_features_transformer = FunctionTransformer(lambda df: df[final_features], validate=False)
The following features were selected based on clinical relevance, exploratory analysis, and engineered signal strength:
These features balance interpretability and predictive value, making them suitable for modeling motor symptom severity.
def get_scaler():
    """Build a fresh RobustScaler (median/IQR scaling, outlier-tolerant)."""
    scaler = RobustScaler()
    return scaler
Three classification models (Logistic Regression, Random Forest, and XGBoost) were wrapped in Pipeline objects, each with 'RobustScaler' applied in pipeline step.
This ensures consistent feature scaling before model fitting.
'StratifiedGroupKFold' was used to perform 5-fold cross-validation, preserving both class balance and group independence across folds.
For each model, predictions were collected across all validation folds and combined to evaluate performance.
The final classification report for each model summarizes overall precision, recall, and F1-score across all folds using the encoded severity labels.
# Define group labels for cross-validation (one group per subject)
groups_train = df.iloc[train_idx]["subject#"]
from sklearn.decomposition import PCA  # NOTE(review): imported but never used — no PCA step exists below
# Define the three model pipelines: feature engineering -> feature
# selection -> robust scaling -> classifier.
# (The old comment claimed a PCA step was added; there is none.)
clf_pipelines = {
    "Logistic": Pipeline([
        ("engineer", engineer_transformer),
        ("select", select_features_transformer),
        ("scale", get_scaler()),
        ("clf", LogisticRegression(max_iter=1000, class_weight='balanced'))
    ]),
    "RandomForest": Pipeline([
        ("engineer", engineer_transformer),
        ("select", select_features_transformer),
        ("scale", get_scaler()),
        ("clf", RandomForestClassifier(n_estimators=100, class_weight='balanced', random_state=42))
    ]),
    "XGBoost": Pipeline([
        ("engineer", engineer_transformer),
        ("select", select_features_transformer),
        ("scale", get_scaler()),
        # NOTE(review): use_label_encoder is deprecated/ignored in recent
        # xgboost releases — confirm the installed version accepts it.
        ("clf", XGBClassifier(n_estimators=100, use_label_encoder=False, eval_metric='logloss', random_state=42))
    ])
}
# Stratified group-aware cross-validation: preserves class balance per
# fold while keeping each subject's rows in a single fold
cv = StratifiedGroupKFold(n_splits=5)
cv_clf_results_all = {}
# Evaluate each classifier with subject-grouped, stratified 5-fold CV
for model_name, pipeline in clf_pipelines.items():
    clf_preds = []
    for tr_idx, val_idx in cv.split(X_train, y_train_encoded, groups=groups_train):
        X_tr, X_val = X_train.iloc[tr_idx], X_train.iloc[val_idx]
        y_tr, y_val = y_train_encoded[tr_idx], y_train_encoded[val_idx]
        pipeline.fit(X_tr, y_tr)
        y_pred = pipeline.predict(X_val)
        # Keep actual/predicted labels for this fold (indexed like X_val)
        fold_df = pd.DataFrame({
            "Actual_Severity": y_val,
            "Predicted_Severity": y_pred
        }, index=X_val.index)
        clf_preds.append(fold_df)
    # Store aggregated results for this model (original row index discarded
    # by ignore_index=True)
    cv_clf_results_all[model_name] = pd.concat(clf_preds, ignore_index=True)
# Print out performance metrics pooled over all validation folds
for name, result_df in cv_clf_results_all.items():
    print(f"\n{name} Classification Performance:")
    print(classification_report(result_df["Actual_Severity"], result_df["Predicted_Severity"], target_names=le.classes_))
Logistic Classification Performance:
precision recall f1-score support
Mild 0.52 0.55 0.53 2074
Moderate+ 0.61 0.59 0.60 2554
accuracy 0.57 4628
macro avg 0.57 0.57 0.56 4628
weighted avg 0.57 0.57 0.57 4628
RandomForest Classification Performance:
precision recall f1-score support
Mild 0.38 0.40 0.39 2074
Moderate+ 0.49 0.47 0.48 2554
accuracy 0.44 4628
macro avg 0.44 0.43 0.43 4628
weighted avg 0.44 0.44 0.44 4628
XGBoost Classification Performance:
precision recall f1-score support
Mild 0.43 0.49 0.46 2074
Moderate+ 0.53 0.47 0.50 2554
accuracy 0.48 4628
macro avg 0.48 0.48 0.48 4628
weighted avg 0.49 0.48 0.48 4628
from sklearn.metrics import precision_score, recall_score, f1_score, accuracy_score, confusion_matrix
summary_metrics = []
# Loop over the collected CV results and derive summary metrics per model
for model_name, result_df in cv_clf_results_all.items():
    y_true = result_df["Actual_Severity"]
    y_pred = result_df["Predicted_Severity"]
    # Confusion matrix components; for binary labels ravel() yields
    # (tn, fp, fn, tp) with class 1 ("Moderate+") treated as positive
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
    summary_metrics.append({
        "Model": model_name,
        "Accuracy": round(accuracy_score(y_true, y_pred), 3),
        "Precision (Macro)": round(precision_score(y_true, y_pred, average="macro"), 3),
        "Recall (Macro)": round(recall_score(y_true, y_pred, average="macro"), 3),
        "F1-Score (Macro)": round(f1_score(y_true, y_pred, average="macro"), 3),
        "Sensitivity (TPR)": round(tp / (tp + fn), 3),
        "Specificity (TNR)": round(tn / (tn + fp), 3)
    })
# Create and display summary DataFrame, best macro-F1 first
comparison_df = pd.DataFrame(summary_metrics).sort_values(by="F1-Score (Macro)", ascending=False)
print("\nModel Comparison Summary (Cross-Validation):")
display(comparison_df)
Model Comparison Summary (Cross-Validation):
| Model | Accuracy | Precision (Macro) | Recall (Macro) | F1-Score (Macro) | Sensitivity (TPR) | Specificity (TNR) | |
|---|---|---|---|---|---|---|---|
| 0 | Logistic | 0.568 | 0.565 | 0.566 | 0.565 | 0.586 | 0.545 |
| 2 | XGBoost | 0.482 | 0.483 | 0.482 | 0.481 | 0.474 | 0.491 |
| 1 | RandomForest | 0.438 | 0.435 | 0.435 | 0.435 | 0.465 | 0.405 |
To optimize model performance, hyperparameter tuning was performed for Logistic Regression, Random Forest, and XGBoost using 'GridSearchCV'.
Each model was wrapped in a 'Pipeline' with a 'RobustScaler' to ensure consistent preprocessing.
The 'tune_model' function takes a model pipeline, parameter grid, training data, and group-aware cross-validation strategy ('StratifiedGroupKFold').
It evaluates each parameter combination using 5-fold cross-validation, scoring by macro-averaged F1-score to handle class imbalance.
Custom parameter grids were defined for each model, and the best estimator (i.e., best set of parameters) was selected and stored for future evaluation.
# === 1. Function to run GridSearchCV on a pipeline ===
def tune_model(pipeline, param_grid, X, y, groups, cv):
    """Grid-search the pipeline's hyperparameters with group-aware CV.

    Scores candidates by macro-averaged F1 (robust to class imbalance)
    and fits all candidates in parallel.  Returns the fitted GridSearchCV
    object (exposes best_estimator_, best_params_, best_score_).
    """
    search = GridSearchCV(
        estimator=pipeline,
        param_grid=param_grid,
        cv=cv,
        scoring='f1_macro',
        n_jobs=-1,
        verbose=1,
    )
    search.fit(X, y, groups=groups)
    return search
# === 2. Define parameter grids for each model ===
param_grids = {
    "Logistic": {
        "clf__C": [0.01, 0.1, 1, 10]  # inverse regularization strength
    },
    "RandomForest": {
        "clf__n_estimators": [100, 200],
        "clf__max_depth": [None, 5, 10],
        "clf__min_samples_split": [2, 5]
    },
    "XGBoost": {
        "clf__n_estimators": [100, 200],
        "clf__max_depth": [3, 5],
        "clf__learning_rate": [0.01, 0.1]
    }
}
# === 3. Perform tuning for each model (grouped CV, macro-F1 scoring) ===
best_estimators = {}
cv_scores = {}
for model_name, pipeline in clf_pipelines.items():
    print(f"\nTuning {model_name}...")
    grid = tune_model(pipeline, param_grids[model_name], X_train, y_train_encoded, groups_train, cv)
    best_estimators[model_name] = grid.best_estimator_
    cv_scores[model_name] = round(grid.best_score_, 4)
    print(f"Best Params for {model_name}: {grid.best_params_}")
    print(f"Best F1 Score: {cv_scores[model_name]}")
# === 4. Create combined model dictionary for training evaluation ===
# Maps display name -> (best fitted pipeline, its CV macro-F1)
tuned_models = {
    "Logistic Regression": (best_estimators["Logistic"], cv_scores["Logistic"]),
    "Random Forest": (best_estimators["RandomForest"], cv_scores["RandomForest"]),
    "XGBoost": (best_estimators["XGBoost"], cv_scores["XGBoost"])
}
Tuning Logistic...
Fitting 5 folds for each of 4 candidates, totalling 20 fits
Best Params for Logistic: {'clf__C': 0.01}
Best F1 Score: 0.5546
Tuning RandomForest...
Fitting 5 folds for each of 12 candidates, totalling 60 fits
Best Params for RandomForest: {'clf__max_depth': 5, 'clf__min_samples_split': 2, 'clf__n_estimators': 200}
Best F1 Score: 0.445
Tuning XGBoost...
Fitting 5 folds for each of 8 candidates, totalling 40 fits
Best Params for XGBoost: {'clf__learning_rate': 0.1, 'clf__max_depth': 5, 'clf__n_estimators': 100}
Best F1 Score: 0.4648
Each best estimator (from hyperparameter tuning) was retrained on the full training set and evaluated to assess model fit.
The evaluate_on_train function computes a classification report, accuracy, and the confusion matrix on the training set, printed alongside the cross-validated F1-score.
A small gap between training and CV performance indicates good generalization, while a large gap may suggest overfitting.
def evaluate_on_train(model_name, model, X_train, y_train, label_encoder, cv_f1_score=None):
    """Refit `model` on the full training set and report in-sample metrics.

    Prints a classification report, accuracy, and the confusion matrix.
    When a cross-validated F1 is supplied it is printed alongside, so the
    train-vs-CV gap (an overfitting indicator) is easy to read off.

    Parameters
    ----------
    model_name : str
        Display name for the report header.
    model : sklearn-compatible estimator/pipeline
        Refit here on (X_train, y_train).
    X_train, y_train : training features and encoded labels.
    label_encoder : fitted LabelEncoder providing the class names.
    cv_f1_score : float or None, optional
        Macro F1 from cross-validation, if available.
    """
    print(f"\n\n=== {model_name} Performance on Training Set ===")
    model.fit(X_train, y_train)
    y_pred = model.predict(X_train)
    print(classification_report(y_train, y_pred, target_names=label_encoder.classes_))
    print("Accuracy:", round(accuracy_score(y_train, y_pred), 3))
    print("\nConfusion Matrix:")
    print(confusion_matrix(y_train, y_pred))
    # `is not None` rather than truthiness: a legitimate CV score of 0.0
    # must still be reported (the old `if cv_f1_score:` skipped it).
    if cv_f1_score is not None:
        print("CV F1-Score (Macro):", round(cv_f1_score, 3))

# Loop through the tuned models, passing along each one's CV macro-F1
for model_name, (model, cv_f1) in tuned_models.items():
    evaluate_on_train(model_name, model, X_train, y_train_encoded, le, cv_f1_score=cv_f1)
=== Logistic Regression Performance on Training Set ===
precision recall f1-score support
Mild 0.62 0.72 0.67 2074
Moderate+ 0.74 0.65 0.69 2554
accuracy 0.68 4628
macro avg 0.68 0.68 0.68 4628
weighted avg 0.69 0.68 0.68 4628
Accuracy: 0.68
Confusion Matrix:
[[1488 586]
[ 896 1658]]
CV F1-Score (Macro): 0.555
=== Random Forest Performance on Training Set ===
precision recall f1-score support
Mild 0.74 0.91 0.81 2074
Moderate+ 0.91 0.73 0.81 2554
accuracy 0.81 4628
macro avg 0.82 0.82 0.81 4628
weighted avg 0.83 0.81 0.81 4628
Accuracy: 0.812
Confusion Matrix:
[[1884 190]
[ 679 1875]]
CV F1-Score (Macro): 0.445
=== XGBoost Performance on Training Set ===
precision recall f1-score support
Mild 0.95 0.98 0.97 2074
Moderate+ 0.99 0.96 0.97 2554
accuracy 0.97 4628
macro avg 0.97 0.97 0.97 4628
weighted avg 0.97 0.97 0.97 4628
Accuracy: 0.969
Confusion Matrix:
[[2038 36]
[ 109 2445]]
CV F1-Score (Macro): 0.465
This table summarizes the performance of all three tuned models on the training set using key classification metrics.
It includes macro-averaged scores, sensitivity (recall for Moderate+), specificity (recall for Mild), and cross-validated F1-scores for comparison.
# Recompute training-set metrics for each tuned model (refit on full train)
train_summary_metrics = []
for model_name, (model, cv_f1_score_val) in tuned_models.items():
    model.fit(X_train, y_train_encoded)
    y_pred = model.predict(X_train)
    acc = accuracy_score(y_train_encoded, y_pred)
    precision = precision_score(y_train_encoded, y_pred, average="macro")
    recall = recall_score(y_train_encoded, y_pred, average="macro")
    f1 = f1_score(y_train_encoded, y_pred, average="macro")
    # Binary confusion-matrix components; class 1 ("Moderate+") is positive
    tn, fp, fn, tp = confusion_matrix(y_train_encoded, y_pred).ravel()
    sensitivity = tp / (tp + fn)
    specificity = tn / (tn + fp)
    train_summary_metrics.append({
        "Model": model_name,
        "Accuracy": round(acc, 3),
        "Precision (Macro)": round(precision, 3),
        "Recall (Macro)": round(recall, 3),
        "F1-Score (Macro)": round(f1, 3),
        "Sensitivity (TPR)": round(sensitivity, 3),
        "Specificity (TNR)": round(specificity, 3),
        "CV F1-Score": round(cv_f1_score_val, 3)  # for the train-vs-CV gap
    })
# === 3. Display comparison table, best in-sample macro-F1 first ===
comparison_df = pd.DataFrame(train_summary_metrics).sort_values(by="F1-Score (Macro)", ascending=False)
print("\n=== Tuned Models: Training Performance Summary ===")
display(comparison_df)
=== Tuned Models: Training Performance Summary ===
| Model | Accuracy | Precision (Macro) | Recall (Macro) | F1-Score (Macro) | Sensitivity (TPR) | Specificity (TNR) | CV F1-Score | |
|---|---|---|---|---|---|---|---|---|
| 2 | XGBoost | 0.969 | 0.967 | 0.970 | 0.968 | 0.957 | 0.983 | 0.465 |
| 1 | Random Forest | 0.812 | 0.822 | 0.821 | 0.812 | 0.734 | 0.908 | 0.445 |
| 0 | Logistic Regression | 0.680 | 0.682 | 0.683 | 0.679 | 0.649 | 0.717 | 0.555 |
The best-performing model from tuning (Logistic Regression) was used to make predictions on the held-out test set.
The following metrics were computed: accuracy, macro-averaged precision, recall and F1-score, sensitivity, specificity, and the confusion matrix.
This evaluation provides an unbiased estimate of real-world model performance, as the test set was never used during training or tuning.
def evaluate_model_on_test(model, X_test, y_test, label_encoder, model_name="Model"):
    """Evaluate a fitted classifier on the held-out test set and print a report.

    Parameters
    ----------
    model : fitted estimator or pipeline exposing ``predict``; any
        preprocessing is expected to happen inside the model itself.
    X_test : test-set features (raw, if the model is a pipeline).
    y_test : encoded true labels for the test set.
    label_encoder : fitted ``LabelEncoder`` whose ``classes_`` supply the
        display names in the classification report.
    model_name : heading used in the printed report.

    Returns
    -------
    y_pred : array of predicted (encoded) labels.
    """
    # ``recall_score`` is missing from the top-of-file sklearn.metrics imports.
    from sklearn.metrics import recall_score

    # Predict on test set
    y_pred = model.predict(X_test)

    # Macro averages treat both classes equally regardless of support.
    acc = accuracy_score(y_test, y_pred)
    precision = precision_score(y_test, y_pred, average="macro")
    recall = recall_score(y_test, y_pred, average="macro")
    f1 = f1_score(y_test, y_pred, average="macro")

    # Sensitivity/specificity only make sense for a 2x2 confusion matrix;
    # fall back to zeros (instead of raising on ravel()) otherwise.
    cm = confusion_matrix(y_test, y_pred)
    tn, fp, fn, tp = cm.ravel() if cm.shape == (2, 2) else (0, 0, 0, 0)
    sensitivity = tp / (tp + fn) if (tp + fn) > 0 else 0
    specificity = tn / (tn + fp) if (tn + fp) > 0 else 0

    # Display detailed performance report
    print(f"=== {model_name} – Held-Out Test Set Performance ===")
    print(classification_report(y_test, y_pred, target_names=label_encoder.classes_))
    print(f"Accuracy: {round(acc, 3)}")
    print(f"Precision (Macro): {round(precision, 3)}")
    print(f"Recall (Macro): {round(recall, 3)}")
    print(f"F1-Score (Macro): {round(f1, 3)}")
    print(f"Sensitivity (TPR): {round(sensitivity, 3)}")
    print(f"Specificity (TNR): {round(specificity, 3)}")
    print("\nConfusion Matrix:")
    print(cm)
    return y_pred
# Evaluate the tuned Logistic Regression pipeline on the held-out test set.
logistic_pipeline = best_estimators["Logistic"]
evaluate_model_on_test(
    logistic_pipeline,
    X_test,           # raw test set; the pipeline applies its own preprocessing
    y_test_encoded,
    le,
    model_name="Logistic Regression",
)
=== Logistic Regression – Held-Out Test Set Performance ===
precision recall f1-score support
Mild 0.63 0.69 0.66 680
Moderate+ 0.57 0.50 0.53 555
accuracy 0.60 1235
macro avg 0.60 0.60 0.60 1235
weighted avg 0.60 0.60 0.60 1235
Accuracy: 0.605
Precision (Macro): 0.599
Recall (Macro): 0.595
F1-Score (Macro): 0.595
Sensitivity (TPR): 0.501
Specificity (TNR): 0.69
Confusion Matrix:
[[469 211]
[277 278]]
array([1, 1, 1, ..., 1, 1, 1])
To interpret how each feature contributes to the prediction, the model coefficients from the best Logistic Regression estimator were extracted.
These coefficients indicate the direction and strength of influence each feature has on the predicted severity class.
# === Inspect the fitted logistic model inside the tuned pipeline ===
# (Module-level name `logistic_model` is reused by later cells — keep it.)
logistic_model = best_estimators["Logistic"].named_steps["clf"]

# Coefficients for the single (binary) decision function.
coef_values = logistic_model.coef_[0]

# === Horizontal bar chart of raw coefficient magnitudes ===
fig, ax = plt.subplots(figsize=(7, 4))
ax.barh(final_features, coef_values)
ax.set_xlabel("Coefficient Value")
ax.set_ylabel("Feature")
ax.set_title("Logistic Regression Feature Importance (Raw Coefficients)")
ax.grid(True)
fig.tight_layout()
plt.show()
Positive coefficients indicate features associated with higher severity (Moderate+), while negative values are linked to the Mild class.
This helps identify which vocal and clinical factors are most influential in the model's decision-making.
def _apply_preprocessing(steps, X):
    """Push a copy of X through the engineer -> select -> scale pipeline steps.

    Mirrors what the fitted pipeline does internally; used here to obtain
    transformed matrices for SHAP/LIME, which need the final feature space.
    """
    return steps["scale"].transform(
        steps["select"].transform(
            steps["engineer"].transform(X.copy())
        )
    )

# Get preprocessing steps from the best Logistic Regression estimator.
preprocessing = best_estimators["Logistic"].named_steps

# Transform training and test data through the same fitted steps
# (previously two copy-pasted transform chains).
X_train_final = _apply_preprocessing(preprocessing, X_train)
X_test_final = _apply_preprocessing(preprocessing, X_test)
SHAP (SHapley Additive exPlanations) is used to interpret how each feature contributes to the model's prediction. This method is based on cooperative game theory and provides both global and individual-level explanations.
In this section, we build a SHAP explainer for the fitted linear model, compute SHAP values for the held-out test set, and produce a global feature-importance bar plot plus a per-sample beeswarm plot.
These plots help identify which features consistently drive predictions toward the "Mild" or "Moderate+" class.
# Fit a SHAP explainer for the linear model on the transformed training data,
# then score every held-out test sample.
lin_explainer = shap.Explainer(logistic_model, X_train_final, feature_names=final_features)
test_shap_values = lin_explainer(X_test_final)

# Global importance: mean |SHAP| per feature (bar chart).
shap.summary_plot(test_shap_values, X_test_final, plot_type="bar", show=True)

# Per-sample impact and direction (beeswarm).
shap.summary_plot(test_shap_values, X_test_final, show=True)
Age Highest global impact on severity classification. Older individuals more likely predicted as Moderate+. Consistent with disease progression trends.
DFA (Detrended Fluctuation Analysis) High values reduce severity prediction. Reflects more stable vocal dynamics, typical of milder symptoms.
Shimmer_Slope Strong indicator of amplitude instability in voice. High values push prediction toward Moderate+.
Age_Sex_Interaction Moderate influence. Suggests age and sex jointly affect vocal degradation patterns.
log_PPE Higher pitch entropy = more vocal irregularity. Indicates impaired neuromuscular control seen in PD.
RPDE Subtle but meaningful. Captures voice periodicity, helps differentiate between healthy and impaired speech.
Sex Plays a role, but secondary to core acoustic markers.
Inv_JitterAbs New inverse jitter-based feature. Lower jitter (higher Inv_JitterAbs) contributes to Mild prediction. Acts as a stability marker in speech.
To better understand the model's limitations, we analyzed SHAP values specifically for misclassified test cases (i.e., where the predicted severity class didn't match the actual one). This helps identify which features were most misleading or contributed to confusion.
# Predict directly on the already-transformed test matrix (no re-transform).
logistic_model = best_estimators["Logistic"].named_steps["clf"]
y_test_pred = logistic_model.predict(X_test_final)

# Boolean mask of the samples the model got wrong.
misclassified_mask = y_test_pred != y_test_encoded
X_misclassified = X_test_final[misclassified_mask]

# Explain the full test set with a linear SHAP explainer built on training data.
explainer = shap.Explainer(logistic_model, X_train_final, feature_names=final_features)
shap_values = explainer(X_test_final)

# Restrict the SHAP explanation to the error cases only.
shap_misclassified = shap.Explanation(
    values=shap_values.values[misclassified_mask],
    base_values=shap_values.base_values[misclassified_mask],
    data=X_misclassified,
    feature_names=final_features,
)

# Beeswarm over the misclassified subset.
shap.summary_plot(shap_misclassified, X_misclassified, plot_type="dot", show=True)
DFA Still dominant among errors. High values sometimes lead to incorrect Mild classification.
Shimmer_Slope High variability persists. Contributes both positively and negatively — inconsistent across individuals.
Age_Sex_Interaction Moderate role. Interactions seem to add ambiguity in borderline predictions.
log_PPE and RPDE Overlapping contributions across classes. May lack discriminative power alone in difficult cases.
Sex Lower influence overall, but shows direction flips. Potentially interacts with acoustic features in subtle ways.
Inv_JitterAbs Low inverse jitter still drives some Moderate+ predictions. Impact less clear in misclassifications.
To improve interpretability at the individual prediction level, LIME (Local Interpretable Model-Agnostic Explanations) was applied to the best Logistic Regression model.
LIME explains a single prediction by approximating the model locally with an interpretable surrogate (e.g., linear model) and identifies the most influential features for each case.
# Build a tabular LIME explainer over the transformed training matrix.
lime_explainer = lime.lime_tabular.LimeTabularExplainer(
    training_data=X_train_final,        # NumPy array in the final feature space
    feature_names=final_features,       # names after the pipeline transforms
    class_names=le.classes_,            # class labels ("Mild", "Moderate+")
    mode='classification',
    discretize_continuous=True,
)

# Randomly choose three distinct test rows to explain.
sample_indices = np.random.choice(X_test_final.shape[0], 3, replace=False)

# Explain each chosen sample with a local linear surrogate.
for i, idx in enumerate(sample_indices):
    print(f"\nLIME Explanation for Test Sample Index: {idx}")
    explanation = lime_explainer.explain_instance(
        data_row=X_test_final[idx],
        predict_fn=logistic_model.predict_proba,
        num_features=8,
    )
    explanation.show_in_notebook(show_table=True)  # Or save as HTML
    # explanation.save_to_file(f"lime_explanation_{i+1}.html")
LIME Explanation for Test Sample Index: 155
LIME Explanation for Test Sample Index: 1227
LIME Explanation for Test Sample Index: 335
Sample 155
Sample 1227
Sample 335